package resolver.analyzer

import java.util.regex.Pattern
import org.apache.log4j.Logger

/**
 * Analyzes the text using regular expressions.
 */
class SimpleTextAnalyzer implements Analyzer {

  /**
   * internal logger
   */
  private Logger logger = Logger.getLogger("analyzer")

  /**
   * additional analyzers which are executed on every line before the
   * regular expression of this analyzer is applied
   */
  private List<Analyzer> internalAnalyzer = new ArrayList<Analyzer>()

  /**
   * default classes used for discovery. Every entry is a regular expression
   * fragment which gets embedded into the pattern created by buildPattern()
   */
  List classes = [
          /acid/,
          /anhydride/,
          /benzoate/,
          /ketone/,
          /ether/,
          /ester/,
          /cyanide/,
          /oxi[md]e/,
          /sacetate/,
          /alcohol/,
          /\w+sulfide/,
          /iodide/,
          /succinate/,
          /sulfonate/,
          /amide/,
          /sodium/,
          /salt/,
          /\w+aldehyde/,
          /methiodide/
  ]

  /**
   * rebuilds the pattern from the current content of the classes list,
   * needs to be called after the list was modified
   * @return the newly compiled pattern
   */
  public recompilePattern() {
    this.pattern = buildPattern()
  }

  /**
   * assembles the regular expression out of fixed fragments and the list of
   * classes, logs the resulting expression and compiles it case insensitive
   * @return the compiled pattern
   */
  protected Pattern buildPattern() {

    // only used locally, so no synchronized buffer is required
    StringBuilder result = new StringBuilder()

    // first alternative: matches vitamins
    result.append(/(\b\w*vitamin\w{0,2}[\s\-]\w\b)/)

    // or we match the second alternative, which is closed at the very end
    result.append(/|(/)

    // optional prefix like (+)- or (R)- or (R2,S2)-
    result.append(/(\([\w+\)]+(,\w+)*\)-)?/)

    // we look for complete word boundaries
    result.append(/\b/)

    // NOTE(review): the next fragments are appended between '[' and ']+' and
    // are therefore part of a character class; they act as a set of allowed
    // characters and not as the sub expressions their comments describe.
    // Left unchanged to keep the matching behavior - confirm the intention
    result.append(/[/)

    // intended for stuff like 1- or A- or 1,2- or 1,2,3,4,5,6-
    result.append(/(\w+(,\w+[\'+])*\-)*/)

    // intended for stuff like ~5~
    result.append(/(~\w~)*/)

    // intended for every possible character in {} or () or [] or a combination of those
    result.append(/((\[.*\])|(\(.*\))|(\{.*\}))*/)

    // intended as "at least one word character follows"
    result.append(/w+/)

    // at least one character out of the class above is required
    result.append(/]+/)

    // end of the word
    result.append(/(\b)/)

    // match our classes
    result.append("(")

    // go over our classes and append them as whitespace prefixed alternatives
    classes.eachWithIndex {String s, int index ->

      result.append(/(/)
      result.append(/\s/)

      result.append(s)
      result.append(/)/)

      // an 'or' is appended after every class except the last one
      if (index < classes.size() - 1) {
        result.append(/|/)
      }
    }

    // we accept between 0 and 2 classes
    result.append(/){0,2}/)

    // our result can end with a bracket, but it doesn't have to
    result.append(/\)*/)

    // close the second alternative
    result.append(")")

    logger.info "compiled pattern: ${result.toString()} "

    return Pattern.compile(result.toString(), Pattern.CASE_INSENSITIVE)
  }

  /**
   * our pattern to match for chemicals
   * which will be matched case insensitive
   * you can find some examples in the test case what matches
   *
   * has to be declared after the logger and the classes, since the
   * initializer calls buildPattern() which accesses both of them
   */
  def pattern = buildPattern()

  /**
   * analyzes the value line by line and tries to find chemicals. Every line
   * is first handed to the internal analyzers, their hits are added to the
   * result and removed from the line, afterwards the pattern is applied to
   * the remaining text
   * @param value the content to analyze, a String, an InputStream or a File
   * @return the set of all discovered names and identifiers
   * @throws InternalError if the value is of any other type
   */
  Set<String> analyze(def value) {
    Set<String> result = new HashSet<String>()

    if (value instanceof String || value instanceof InputStream || value instanceof File) {
      value.eachLine { String text ->

        // pre analyzers
        internalAnalyzer.each {Analyzer analyzer ->
          if (!(analyzer instanceof SimpleTextAnalyzer)) {
            Set<String> res = analyzer.analyze(text)

            res.each {String s ->
              logger.info "removing ${s} from the analzying text"

              // literal replacement: the hit is plain text and not a regular
              // expression, characters like ( ) + . would otherwise be
              // interpreted as regex syntax and could prevent the removal
              // or raise a PatternSyntaxException
              text = text.replace(s, " ")

              result.add(s)
            }

          }
          else {
            logger.info "it does not make much sense in pre analyzing with the same analyzer ${analyzer}"
          }
        }

        // apply the pattern to the remaining text
        def match = (text =~ pattern)

        // the pattern contains groups, so every hit is a list and the
        // complete match is stored at index 0
        match.each {List word ->

          cleanupMatches(word[0], result)
        }

      }
    }
    else {
      // exception type kept as is, since callers might depend on it
      throw new InternalError("please provide an argument which is string of file or inputstream")
    }
    return result
  }

  /**
   * cleans up the matched word and adds it to the result, empty words and
   * words which only consist of digits are ignored
   * @param w the matched word
   * @param result the set which receives the cleaned up word
   */
  protected def cleanupMatches(def w, Set<String> result) {

    // only want to process words longer than 0
    if (w.size() > 0) {
      // drop a single trailing bracket, if there are more closing than opening brackets
      if (w.endsWith(")")) {
        int countLeft = w.count("(")
        int countRight = w.count(")")

        if (countRight > countLeft) {
          w = w.substring(0, w.size() - 1)
        }
      }

      // pure numbers are no chemical names
      if (!w.matches(/\d+/)) {
        // add the word to the result list
        result.add(w)
      }
    }
  }

  /**
   * builds our list of internal analyzers, which are provided by the AnalyzerFactory
   */
  public SimpleTextAnalyzer() {
    this.internalAnalyzer.add AnalyzerFactory.getInchiCodeAnalyzer()
    this.internalAnalyzer.add AnalyzerFactory.getCasAnalyzer()
    this.internalAnalyzer.add AnalyzerFactory.getKeggAnalyzer()
    this.internalAnalyzer.add AnalyzerFactory.getInchiHashKeyAnalyzer()
    this.internalAnalyzer.add AnalyzerFactory.getHMDBAnalyzer()
  }
}
